Code
library(tidyverse)
library(patchwork)
library(ggbeeswarm)
library(ggnewscale)
library(RColorBrewer)
library(hexbin)
library(ggExtra)Libraries
library(tidyverse)
library(patchwork)
library(ggbeeswarm)
library(ggnewscale)
library(RColorBrewer)
library(hexbin)
library(ggExtra)Paths
metadata_path <-
"data/processed/metadata_ashton_desj_all_weavepop_H99.csv"
chromosomes_path <-
"../Crypto_Desjardins_Ashton/results/02.Dataset/chromosomes.csv"
chrom_lengths_path <-
"data/processed/chromosome_lengths.tsv"
depth_by_chrom_good_desj_path <-
"../Crypto_Desjardins/results/04.Intermediate_files/02.Dataset/depth_quality/depth_by_chrom_good.tsv"
depth_by_chrom_good_ashton_path <-
"../Crypto_Ashton/results/04.Intermediate_files/02.Dataset/depth_quality/depth_by_chrom_good.tsv"
cnv_calls_path <-
"../Crypto_Desjardins_Ashton/results/02.Dataset/cnv/cnv_calls.tsv"
duplications_out_path <-
"results/tables/duplications_putative.tsv"
repeats_path_prefix <-
"../Crypto_Desjardins/results/03.References/"Use the metadata table that has all the samples included in the final Crypto_Desjardins_Ashton dataset (n = 1055) and H99.
Select needed columns, remove H99 and get the number of samples per dataset and lineage, per lineage, and total.
metadata <- read.delim(
metadata_path,
header=TRUE,
sep=",",
stringsAsFactor = TRUE)
metadata <- metadata %>%
select(sample, strain, source, lineage, dataset, vni_subdivision)%>%
filter(!strain == "H99") %>%
group_by(dataset, lineage)%>%
mutate(samples_in_dataset_lineage = n_distinct(sample))%>%
ungroup() %>%
group_by(lineage)%>%
mutate(samples_in_lineage = n_distinct(sample))%>%
ungroup()%>%
mutate(total_samples = n_distinct(sample))%>%
droplevels()Get the nice chromosome names from the chromosomes file in WeavePop.
chromosome_names = read.delim(
chromosomes_path,
header=TRUE, sep=",")
chromosome_names <- chromosome_names %>%
mutate(chromosome = str_pad(chromosome, 2, pad = "0"))%>%
mutate(chromosome = as.factor(chromosome))
levels(chromosome_names$chromosome) <- paste("chr", chromosome_names$chromosome, sep="")Get the chromosome lengths. Create the file with bash.
# /usr/bin/bash
tail -n +2 ../Crypto_Desjardins/config/chromosomes.csv | \
cut -d',' -f1 | sort | uniq | while read line
do
seqkit fx2tab ../Crypto_Desjardins/data/references/$line.fasta -l -i -n >> data/processed/chromosome_lengths.tsv
donechromosome_lengths = read.delim(
chrom_lengths_path,
header=FALSE,
col.names=c("accession", "length"),
sep="\t")Import the file with all called CNVs and add the chromosome names.
cnv_calls <- read.delim(
cnv_calls_path,
header=TRUE, sep="\t")%>%
left_join(chromosome_names, by="accession")depth_threshold <- 0.6
del_threshold <- 1 - depth_threshold
dup_threshold <- 1 + depth_thresholdcnv_summary <- cnv_calls %>%
group_by(cnv) %>%
summarize(
count = n(),
min_norm_depth = min(norm_depth, na.rm = TRUE),
mean_norm_depth = mean(norm_depth, na.rm = TRUE),
q25_norm_depth = quantile(norm_depth, probs = 0.25, na.rm = TRUE),
median_norm_depth = median(norm_depth, na.rm = TRUE),
q75_norm_depth = quantile(norm_depth, probs = 0.75, na.rm = TRUE),
max_norm_depth = max(norm_depth, na.rm = TRUE),
min_smooth_depth = min(smooth_depth, na.rm = TRUE),
mean_smooth_depth = mean(smooth_depth, na.rm = TRUE),
q25_smooth_depth = quantile(smooth_depth, probs = 0.25, na.rm = TRUE),
median_smooth_depth = median(smooth_depth, na.rm = TRUE),
q75_smooth_depth = quantile(smooth_depth, probs = 0.75, na.rm = TRUE),
max_smooth_depth = max(smooth_depth, na.rm = TRUE),
min_region_size = min(region_size, na.rm = TRUE),
mean_region_size = mean(region_size, na.rm = TRUE),
q25_region_size = quantile(region_size, probs = 0.25, na.rm = TRUE),
median_region_size = median(region_size, na.rm = TRUE),
q75_region_size = quantile(region_size, probs = 0.75, na.rm = TRUE),
max_region_size = max(region_size, na.rm = TRUE),
)
cnv_summary| cnv | count | min_norm_depth | mean_norm_depth | q25_norm_depth | median_norm_depth | q75_norm_depth | max_norm_depth | min_smooth_depth | mean_smooth_depth | q25_smooth_depth | median_smooth_depth | q75_smooth_depth | max_smooth_depth | min_region_size | mean_region_size | q25_region_size | median_region_size | q75_region_size | max_region_size |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| deletion | 49641 | 0 | 0.0743412 | 0.00 | 0.01 | 0.08 | 6.82 | 0.00 | 0.0973373 | 0.00 | 0.04 | 0.15 | 0.49 | 3 | 14582.99 | 6500 | 10000 | 19994 | 538500 |
| duplication | 9356 | 0 | 6.3060400 | 1.53 | 1.62 | 1.94 | 113.87 | 1.51 | 6.1987730 | 1.53 | 1.60 | 1.87 | 111.11 | 8 | 18550.87 | 3500 | 8000 | 14000 | 948000 |
There are a lot more deletions than duplications.
Get the percentage of each chromosome covered by repeats to know how much of a chromosome might not have reliable CNV calls.
lineages <- unique(metadata$lineage)
repeats_all <- data.frame()
for(lineage in lineages){
repeats_path <-paste(
repeats_path_prefix,
lineage, "/", lineage, "_repeats.bed",
sep ="")
repeats <- read.delim(repeats_path,
header=FALSE,
col.names=c("accession", "start", "end", "repeat_type"),
sep="\t")
repeats$lineage <- lineage
repeats_all <- rbind(repeats_all, repeats)
}repeats_percent <- repeats_all %>%
mutate(repeat_size_each = end - start)%>%
group_by(accession, lineage) %>%
summarise(repeat_size = sum(repeat_size_each)) %>%
left_join(chromosome_lengths, by="accession") %>%
left_join(chromosome_names, by=c("accession","lineage")) %>%
mutate(percent_repeat_size = round((repeat_size / length) * 100, 2))%>%
mutate(chromosome = as.factor(chromosome))%>%
select(lineage, accession, chromosome, percent_repeat_size)Most chromosomes have repeats in between 5 and 10% of the size. In VNBII it is closer to 15% for some chormosomes.
d <- ggplot(cnv_calls, aes(repeat_fraction, fill = cnv)) +
facet_wrap(~cnv, ncol = 1, scales = "free")+
geom_histogram(binwidth = 0.01)+
scale_x_continuous(labels = scales::comma)+
theme_minimal() +
theme(legend.position = "none")+
labs(title = "Histogram of Repeat Fraction of CNVs",
subtitle = paste("Binwidth: 0.01"),
y = "Count",
x = "Repeat Fraction")
d In deletions, more of them have high repetitive fraction. In duplications, most of them have low repetitive fraction.
The Normalized depth (norm_depth) is the median of the normalized mean depth of the windows that are part of the CNV region.
The Smooth normalized depth (smooth_depth) is the median of the smooth normalized mean depth of the windows that are part of the CNV region.
d <- ggplot(cnv_calls, aes(norm_depth, fill = cnv)) +
facet_wrap(~cnv, ncol = 1)+
geom_histogram(binwidth = 0.1)+
scale_y_log10(labels = scales::comma) +
theme_minimal() +
theme(legend.position = "none")+
labs(title = "Histogram of Normalized Depth",
y = "Count (Log 10)",
x = "Normalized Depth")
s <- ggplot(cnv_calls, aes(smooth_depth, fill = cnv)) +
facet_wrap(~cnv, ncol = 1)+
geom_histogram(binwidth = 0.1)+
scale_y_log10(labels = scales::comma) +
theme_minimal() +
theme(legend.position = "none")+
labs(title = "Histogram of Smooth Normalized Depth",
y = "Count (Log 10)",
x = "Smooth Normalized Depth")
d | sThere are CNVs with extremely high normalized depth. I will truncate the normalized depth to 5.
d <- ggplot(cnv_calls, aes(norm_depth, fill = cnv)) +
facet_wrap(~cnv, ncol = 1)+
geom_histogram(binwidth = 0.01)+
geom_vline(xintercept = del_threshold)+
geom_vline(xintercept = dup_threshold)+
scale_y_log10(labels = scales::comma) +
xlim(0,5)+
theme_minimal() +
theme(legend.position = "none")+
labs(title = "Histogram of Normalized Depth",
subtitle = paste("Lines in depth threshold:", depth_threshold),
y = "Count (Log 10)",
x = "Normalized Depth (truncated)")
s <- ggplot(cnv_calls, aes(smooth_depth, fill = cnv)) +
facet_wrap(~cnv, ncol = 1)+
geom_histogram(binwidth = 0.01)+
geom_vline(xintercept = del_threshold)+
geom_vline(xintercept = dup_threshold)+
scale_y_log10(labels = scales::comma) +
xlim(0,5)+
theme_minimal() +
theme(legend.position = "none")+
labs(title = "Histogram of Smooth Normalized Depth",
subtitle = paste("Lines in depth threshold:", depth_threshold),
y = "Count (Log 10)",
x = "Smooth Normalized Depth (truncated)")
d | sMost deletions have depth (normalized and smooth) close to 0. Most duplications have depth (normalized and smooth) close to the depth threshold with which CNVs were called.
p <- ggplot(cnv_calls, aes(x = norm_depth, y = smooth_depth)) +
geom_abline(slope = 1, intercept = 0, linetype = "solid", color = "lightgray") +
geom_hex(bins = 50) +
facet_wrap(~cut(repeat_fraction, breaks = 4), nrow = 2) +
scale_fill_viridis_c(name = "Count\nLog10", direction = -1, trans = "log10") +
scale_x_continuous(labels = scales::comma) +
scale_y_continuous(labels = scales::comma) +
theme_minimal() +
labs(title = "Hexbin Plot of Normalized Depth vs Smooth Normalized Depth",
subtitle = "Panels by interval of Repeat Fraction",
x = "Normalized Depth",
y = "Smooth Normalized Depth")
o <- ggplot(cnv_calls, aes(x = norm_depth, y = smooth_depth)) +
geom_abline(slope = 1, intercept = 0, linetype = "solid", color = "lightgray") +
geom_hex(bins = 50) +
facet_wrap(~cut(repeat_fraction, breaks = 4), nrow = 2) +
scale_x_continuous(labels = scales::comma, limits = c(0,5)) +
scale_y_continuous(labels = scales::comma, limits = c(0,5)) +
scale_fill_viridis_c(name = "Count\nLog10", direction = -1, trans = "log10") +
theme_minimal() +
labs(title = "Hexbin Plot of Normalized Depth vs Smooth Normalized Depth (Truncated axes)",
subtitle = "Panels by interval of Repeat Fraction",
x = "Normalized Depth",
y = "Smooth Normalized Depth")
p / oSince the CNVs are called using the smooth_depth, but it does not always coincide with the norm_depth, there are called CNVs in between the depth threshold used in the CNV calling. Most CNVs are in the diagonal where smooth depth is the same as normalized depth.
There are many duplications with extremely high depth that are in the same high interval of repetitive fraction.
There are more deletion in the high repetitive fraction interval. There are more duplications in the low repetitive fraction interval.
d <- ggplot(cnv_calls, aes(region_size, fill = cnv)) +
facet_wrap(~cnv, ncol = 1)+
geom_histogram(binwidth = 1000)+
scale_x_continuous(labels = scales::comma)+
scale_y_log10(labels = scales::comma) +
theme_minimal() +
theme(legend.position = "none")+
labs(title = "Histogram of Sizes of CNVs",
subtitle = paste("Binwidth: 1000\nLength of largest chromosome", scales::comma(max(chromosome_lengths$length))),
y = "Count (Log10)",
x = "Size of CNVs")
d Only duplication are larger than ~150kb.
p <- ggplot(cnv_calls, aes(x = region_size, y = smooth_depth)) +
geom_abline(slope = 1, intercept = 0, linetype = "solid", color = "lightgray") +
geom_hex(bins = 50) +
facet_wrap(~cut(repeat_fraction, breaks = 4), nrow = 2) +
scale_fill_viridis_c(name = "Count\nLog10", direction = -1, trans = "log10") +
scale_x_continuous(labels = scales::comma) +
scale_y_continuous(labels = scales::comma) +
theme_minimal() +
labs(title = "Hexbin Plot of Smooth Normalized Depth vs CNV Size",
subtitle = "Panels by interval of Repeat Fraction",
x = "CNV Size",
y = "Smooth Normalized Depth")
o <- ggplot(cnv_calls, aes(x = region_size, y = smooth_depth)) +
geom_abline(slope = 1, intercept = 0, linetype = "solid", color = "lightgray") +
geom_hex(bins = 50) +
facet_wrap(~cut(repeat_fraction, breaks = 4), nrow = 2) +
scale_x_continuous(labels = scales::comma) +
scale_y_continuous(labels = scales::comma, limits = c(0,5)) +
scale_fill_viridis_c(name = "Count\nLog10", direction = -1, trans = "log10") +
theme_minimal() +
labs(title = "Hexbin Plot of Smooth Normalized Depth vs CNV Size (Truncated Y axis)",
subtitle = "Panels by interval of Repeat Fraction",
x = "CNV Size",
y = "Smooth Normalized Depth")
p / oThere are many small CNVs in a high repetitive fraction interval that have extremely high depth.
p <- ggplot(cnv_calls, aes(x = region_size, y = norm_depth)) +
geom_abline(slope = 1, intercept = 0, linetype = "solid", color = "lightgray") +
geom_hex(bins = 50) +
facet_wrap(~cut(repeat_fraction, breaks = 4), nrow = 2) +
scale_fill_viridis_c(name = "Count\nLog10", direction = -1, trans = "log10") +
scale_x_continuous(labels = scales::comma) +
scale_y_continuous(labels = scales::comma) +
theme_minimal() +
labs(title = "Hexbin Plot of Normalized Depth vs CNV Size",
subtitle = "Panels by interval of Repeat Fraction",
x = "CNV Size",
y = "Normalized Depth")
o <- ggplot(cnv_calls, aes(x = region_size, y = norm_depth)) +
geom_abline(slope = 1, intercept = 0, linetype = "solid", color = "lightgray") +
geom_hex(bins = 50) +
facet_wrap(~cut(repeat_fraction, breaks = 4), nrow = 2) +
scale_fill_viridis_c(name = "Count\nLog10", direction = -1, trans = "log10") +
scale_x_continuous(labels = scales::comma) +
scale_y_continuous(labels = scales::comma, limits = c(0,5)) +
theme_minimal() +
labs(title = "Hexbin Plot of Normalized Depth vs CNV Size (Truncated Y axis)",
subtitle = "Panels by interval of Repeat Fraction",
x = "CNV Size",
y = "Normalized Depth")
p / oMostly, the CNVs with very high smooth_depth are small.
The CNVs whose norm_depth is in between the depth thresholds are small. (These are the ones that the norm_depth and smooth_depth don’t coincide).
Some dupCNVs are large and have very high depth (the ones that are neither in the vertical cluster of small CNVs and high depth or the horizontal cluster of large CNVs and depth around 2):
filter(cnv_calls, region_size > 50000, smooth_depth > 2.5)%>%
select(sample, lineage, chromosome, norm_depth, smooth_depth, region_size, repeat_fraction)| sample | lineage | chromosome | norm_depth | smooth_depth | region_size | repeat_fraction |
|---|---|---|---|---|---|---|
| SRS885892 | VNBI | chr04 | 4.39 | 4.53 | 212500 | 0.03 |
| SRS417676 | VNI | chr04 | 2.83 | 2.84 | 52500 | 0.08 |
| SRS417676 | VNI | chr09 | 3.18 | 3.59 | 191000 | 0.02 |
| ERS1142739 | VNI | chr05 | 3.11 | 3.25 | 55000 | 0.10 |
| ERS2541126 | VNI | chr13 | 2.50 | 2.62 | 130500 | 0.12 |
| ERS542397 | VNI | chr04 | 2.76 | 2.77 | 705000 | 0.03 |
| ERS2541331 | VNI | chr04 | 3.75 | 3.79 | 82500 | 0.01 |
| ERS2541212 | VNI | chr13 | 2.66 | 2.67 | 130000 | 0.12 |
| ERS542490 | VNI | chr04 | 3.23 | 3.25 | 344500 | 0.06 |
| ERS2541064 | VNI | chr06 | 5.10 | 5.18 | 65500 | 0.03 |
The CNVs with the highest Normalized Depth (smooth or not), are highly repetitive.
There are many delCNVs completely repetitive. The majority of the dupCNVs are not repetitive.
Keep only the CNVs labeled as duplications (dupCNVs)
dup_calls <- filter(cnv_calls, cnv == "duplication")To filter out the dupCNVs that have very large Normalized Depth. Here I make groups of CNVs that start at the same position in the same chromosome.
dup_summary <- dup_calls %>%
group_by(lineage, chromosome, start, end, region_size) %>%
summarize(max_depth = max(norm_depth),
median_depth = median(norm_depth),
n = n(),
median_repeat = median(repeat_fraction))The dupCNVs groups with max depth larger than 5:
summary_large_dups <- dup_summary %>%
filter((max_depth > 5))%>%
arrange(desc(n))
head(summary_large_dups)| lineage | chromosome | start | end | region_size | max_depth | median_depth | n | median_repeat |
|---|---|---|---|---|---|---|---|---|
| VNI | chr01 | 339501 | 351000 | 11500 | 9.63 | 6.090 | 495 | 0.00 |
| VNI | chr02 | 274001 | 282000 | 8000 | 113.87 | 50.300 | 440 | 0.65 |
| VNI | chr02 | 274501 | 281500 | 7000 | 100.82 | 45.080 | 335 | 0.60 |
| VNI | chr10 | 582001 | 587500 | 5500 | 6.47 | 4.635 | 150 | 0.47 |
| VNI | chr02 | 274501 | 282000 | 7500 | 74.77 | 54.430 | 57 | 0.63 |
| VNI | chr01 | 1 | 3000 | 3000 | 36.98 | 8.070 | 25 | 0.10 |
Filter out duplications that are part of the groups in the previous table.
This way of filtering removes individual CNVs that not necessarily have depth grater than 5 but other samples in the lineage have a high depth CNV in the same region.
dup_calls_filtered <- dup_calls %>%
filter(!paste(start, end, chromosome, lineage) %in%
paste(summary_large_dups$start, summary_large_dups$end, summary_large_dups$chromosome, summary_large_dups$lineage))n <- ggplot(dup_calls_filtered, aes(x = chromosome, y = norm_depth, color = chromosome)) +
geom_quasirandom()+
facet_wrap(~cut(repeat_fraction, breaks = 4), ncol = 1) +
theme_minimal() +
ylim(0,5)+
theme(legend.position = "none", axis.text.x = element_text(angle = 90))+
labs(title = "Distribution of Normalized Depth of dupCNVs",
subtitle = "Panels by interval of Repeat Fraction",
y = "Normalized Depth",
x = "Chromosome")
s <- ggplot(dup_calls_filtered, aes(x = chromosome, y = smooth_depth, color = chromosome)) +
geom_quasirandom()+
facet_wrap(~cut(repeat_fraction, breaks = 4), ncol = 1) +
theme_minimal() +
ylim(0,5)+
theme(legend.position = "none", axis.text.x = element_text(angle = 90))+
labs(title = "Distribution of Smooth Normalized Depth of dupCNVs",
subtitle = "Panels by interval of Repeat Fraction",
y = "Smooth Normalized Depth",
x = "Chromosome")
n | srepeat_threshold = 0.25summary_repetitive_dups <- dup_summary %>%
filter((median_repeat > 0.25))%>%
arrange(desc(n))dup_calls_filtered <- dup_calls_filtered %>%
filter(!paste(start, end, chromosome, lineage) %in%
paste(summary_repetitive_dups$start, summary_repetitive_dups$end, summary_repetitive_dups$chromosome, summary_repetitive_dups$lineage))n <- ggplot(dup_calls_filtered, aes(x = chromosome, y = norm_depth, color = chromosome)) +
geom_quasirandom()+
theme_minimal() +
ylim(0,5)+
theme(legend.position = "none", axis.text.x = element_text(angle = 90))+
labs(title = "Distribution of Normalized Depth of dupCNVs",
y = "Normalized Depth",
x = "Chromosome")
s <- ggplot(dup_calls_filtered, aes(x = chromosome, y = smooth_depth, color = chromosome)) +
geom_quasirandom()+
theme_minimal() +
ylim(0,5)+
theme(legend.position = "none", axis.text.x = element_text(angle = 90))+
labs(title = "Distribution of Smooth Normalized Depth of dupCNVs",
y = "Smooth Normalized Depth",
x = "Chromosome")
n | sThe Normalized median depth of chromosomes (norm_chrom_median) is the normalized median of the depth of the positions in the whole chromosome. (First the median was calculated and then normalized). The Normalized mean depth of chromosomes (norm_chrom_mean) is the normalized mean of the depth of the positions in the whole chromosome. (First the mean was calculated and then normalized).
depth_by_chrom_good_desjardins <- read.delim(
depth_by_chrom_good_desj_path,
header=TRUE, sep="\t")
depth_by_chrom_good_ashton <- read.delim(
depth_by_chrom_good_ashton_path,
header=TRUE, sep="\t")
depth_by_chrom_good <- rbind(depth_by_chrom_good_desjardins, depth_by_chrom_good_ashton)
depth_by_chrom <- depth_by_chrom_good %>%
select(sample, accession, norm_chrom_median, norm_chrom_mean)
head(depth_by_chrom)| sample | accession | norm_chrom_median | norm_chrom_mean |
|---|---|---|---|
| SRS404518 | CP003820.1 | 0.98 | 0.97 |
| SRS404518 | CP003821.1 | 0.98 | 1.12 |
| SRS404518 | CP003822.1 | 0.99 | 1.01 |
| SRS404518 | CP003823.1 | 1.00 | 0.99 |
| SRS404518 | CP003824.1 | 0.99 | 0.98 |
| SRS404518 | CP003825.1 | 1.00 | 0.99 |
med <- ggplot(depth_by_chrom)+
geom_histogram(aes(norm_chrom_median), binwidth = 0.01)+
scale_x_continuous(breaks = seq(0,max(depth_by_chrom$norm_chrom_median), by = 0.1)) +
theme_minimal() +
labs(title = "Histogram of Normalized median depth of chromosomes",
y = "Count",
x = "Normalized median depth of chromosomes")
mea <- ggplot(depth_by_chrom)+
geom_histogram(aes(norm_chrom_mean), binwidth = 0.01)+
scale_x_continuous(breaks = seq(0,max(depth_by_chrom$norm_chrom_median), by = 0.1)) +
theme_minimal() +
labs(title = "Histogram of Normalized mean depth of chromosomes",
y = "Count",
x = "Normalized mean depth of chromosomes")
med / meamed <- ggplot(filter(depth_by_chrom, norm_chrom_median > 1.2 ))+
geom_histogram(aes(norm_chrom_median), binwidth = 0.01)+
scale_x_continuous(breaks = seq(0,max(depth_by_chrom$norm_chrom_median), by = 0.1)) +
scale_y_continuous(
breaks = seq(0, max(table(round(depth_by_chrom$norm_chrom_median ))) , 1)
)+
theme_minimal() +
theme(panel.grid.minor = element_blank())+
labs(title = "Histogram of Normalized median depth of chromosomes",
subtitle = "Only values above 1.2",
y = "Count",
x = "Normalized median depth of chromosomes")
mea <- ggplot(filter(depth_by_chrom, norm_chrom_mean > 1.2 ))+
geom_histogram(aes(norm_chrom_mean), binwidth = 0.01)+
scale_x_continuous(breaks = seq(0,max(depth_by_chrom$norm_chrom_mean), by = 0.1)) +
#scale_y_continuous(breaks = seq(0, max(table(round(depth_by_chrom$norm_chrom_mean ))) , 1))+
theme_minimal() +
theme(panel.grid.minor = element_blank())+
labs(title = "Histogram of Normalized mean depth of chromosomes",
subtitle = "Only values above 1.2",
y = "Count",
x = "Normalized mean depth of chromosomes")
med / meaChromosomes with very high median depth:
left_join(depth_by_chrom, chromosome_names, by = "accession")%>%
left_join(metadata, by = c("sample", "lineage"))%>%
select(lineage, sample, strain, chromosome, norm_chrom_median, norm_chrom_mean)%>%
filter(norm_chrom_median > 2.2)%>%
arrange(norm_chrom_median)| lineage | sample | strain | chromosome | norm_chrom_median | norm_chrom_mean |
|---|---|---|---|---|---|
| VNI | ERS1142697 | 20427_2#6 | chr12 | 2.24 | 2.20 |
| VNI | ERS2541126 | BMD3144 | chr13 | 2.24 | 2.23 |
| VNBI | SRS885841 | NRHc5009.REL.INI | chr14 | 2.25 | 2.23 |
| VNI | SRS404481 | Bt139 | chr13 | 2.27 | 2.20 |
| VNI | SRS885916 | PMHc1031A.ENR.INI.LP | chr12 | 2.31 | 2.34 |
| VNBII | SRS881180 | PMHc1029.ENR.STOR | chr12 | 2.45 | 2.42 |
| VNI | ERS2541212 | 04CN-03-039 | chr13 | 2.45 | 2.39 |
| VNI | ERS542397 | 14936_1#6 | chr04 | 2.74 | 2.34 |
The dataframe all_metrics contains a row per individual called duplication with the information about the CNV and the chromosome of the sample. For the chromosome-sample without any called duplication, the columns related to CNVs have NAs.
all_metrics <- left_join(depth_by_chrom, dup_calls_filtered,
by = c("sample", "accession"))%>%
select(-chromosome, -lineage)%>%
left_join(chromosome_lengths, by = "accession") %>%
left_join(chromosome_names,by ="accession") %>%
left_join(metadata, by = c("sample", "lineage"))
head(all_metrics)| sample | accession | norm_chrom_median | norm_chrom_mean | start | end | cnv | region_size | depth | norm_depth | smooth_depth | repeat_fraction | overlap_bp | feature_id | length | lineage | chromosome | strain | source | dataset | vni_subdivision | samples_in_dataset_lineage | samples_in_lineage | total_samples |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SRS404518 | CP003820.1 | 0.98 | 0.97 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 2291499 | VNI | chr01 | Bt134 | Clinical | Desjardins | VNIa-5 | 185 | 853 | 1055 |
| SRS404518 | CP003821.1 | 0.98 | 1.12 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 1621675 | VNI | chr02 | Bt134 | Clinical | Desjardins | VNIa-5 | 185 | 853 | 1055 |
| SRS404518 | CP003822.1 | 0.99 | 1.01 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 1575141 | VNI | chr03 | Bt134 | Clinical | Desjardins | VNIa-5 | 185 | 853 | 1055 |
| SRS404518 | CP003823.1 | 1.00 | 0.99 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 1084805 | VNI | chr04 | Bt134 | Clinical | Desjardins | VNIa-5 | 185 | 853 | 1055 |
| SRS404518 | CP003824.1 | 0.99 | 0.98 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 1814975 | VNI | chr05 | Bt134 | Clinical | Desjardins | VNIa-5 | 185 | 853 | 1055 |
| SRS404518 | CP003825.1 | 1.00 | 0.99 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 1422463 | VNI | chr06 | Bt134 | Clinical | Desjardins | VNIa-5 | 185 | 853 | 1055 |
And keep the rest of the chromosome metrics.
chrom_metrics <- all_metrics %>%
group_by(sample, strain, chromosome, norm_chrom_mean, norm_chrom_median, length,
accession, source, lineage,
samples_in_lineage, samples_in_dataset_lineage,
total_samples,dataset) %>%
summarise(total_cnv_size = sum(region_size),
n_cnvs = sum(!is.na(cnv)), #fix this
first = min(start),
last = max(end),
mean_cnv_depth = round(mean(norm_depth),2),
median_cnv_depth = round(median(norm_depth),2),
repeat_size = sum(overlap_bp)) %>%
ungroup()%>%
mutate(dup_coverage_percent = round((total_cnv_size / length) * 100, 2),
dup_span_size = last - first,
dup_span_percent = round((dup_span_size / length) * 100, 2),
dup_repeat_percent = round((repeat_size / length) * 100, 2),
chromosome = as.factor(chromosome),
dup_coverage_percent = ifelse(is.na(dup_coverage_percent), 0, dup_coverage_percent),
dup_span_percent = ifelse(is.na(dup_span_percent), 0, dup_span_percent))
head(chrom_metrics)| sample | strain | chromosome | norm_chrom_mean | norm_chrom_median | length | accession | source | lineage | samples_in_lineage | samples_in_dataset_lineage | total_samples | dataset | total_cnv_size | n_cnvs | first | last | mean_cnv_depth | median_cnv_depth | repeat_size | dup_coverage_percent | dup_span_size | dup_span_percent | dup_repeat_percent |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ERS1142690 | 20949_2#1 | chr01 | 1.00 | 0.98 | 2291499 | CP003820.1 | Clinical | VNI | 853 | 668 | 1055 | Ashton | NA | 0 | NA | NA | NA | NA | NA | 0 | NA | 0 | NA |
| ERS1142690 | 20949_2#1 | chr02 | 1.20 | 0.96 | 1621675 | CP003821.1 | Clinical | VNI | 853 | 668 | 1055 | Ashton | NA | 0 | NA | NA | NA | NA | NA | 0 | NA | 0 | NA |
| ERS1142690 | 20949_2#1 | chr03 | 0.99 | 0.98 | 1575141 | CP003822.1 | Clinical | VNI | 853 | 668 | 1055 | Ashton | NA | 0 | NA | NA | NA | NA | NA | 0 | NA | 0 | NA |
| ERS1142690 | 20949_2#1 | chr04 | 1.00 | 1.00 | 1084805 | CP003823.1 | Clinical | VNI | 853 | 668 | 1055 | Ashton | NA | 0 | NA | NA | NA | NA | NA | 0 | NA | 0 | NA |
| ERS1142690 | 20949_2#1 | chr05 | 0.97 | 0.98 | 1814975 | CP003824.1 | Clinical | VNI | 853 | 668 | 1055 | Ashton | NA | 0 | NA | NA | NA | NA | NA | 0 | NA | 0 | NA |
| ERS1142690 | 20949_2#1 | chr06 | 1.00 | 1.00 | 1422463 | CP003825.1 | Clinical | VNI | 853 | 668 | 1055 | Ashton | NA | 0 | NA | NA | NA | NA | NA | 0 | NA | 0 | NA |
The total number of chromosomes in all samples in the dataset is: 14770
The total number of chromosomes in all samples with called duplications is: 2384
ggplot(filter(chrom_metrics, n_cnvs > 0))+
geom_histogram(aes(n_cnvs), binwidth = 1)+
scale_x_continuous() +
theme_minimal() +
labs(title = "Histogram of number of dupCNVs per chromosome",
y = "Count",
x = "Number of dupCNVs")p <- ggplot(filter(chrom_metrics, n_cnvs > 0))+
geom_quasirandom(aes(x = chromosome, y = dup_coverage_percent))+
theme_minimal() +
labs(title = "Percent of Chromosome Covered by dupCNVs per Chromosome",
subtitle = "Only chromosomes with at least 1 dupCNV",
y = "Percent of Chromosome Covered by dupCNVs",
x = "Chromosome")
ggMarginal(p, type = "histogram")p <- ggplot(filter(chrom_metrics, dup_coverage_percent > 10))+
geom_quasirandom(aes(x = chromosome, y = dup_coverage_percent))+
theme_minimal() +
labs(title = "Percent of Chromosome Covered by dupCNVs per Chromosome",
subtitle = "Only chromosomes with at least 10% covered",
x = "Chromosome",
y = "Percent of Chromosome Covered by dupCNVs")
ggMarginal(p, type = "histogram")
### Percentage of chromosome Spanned by dupCNVs
p <- ggplot(filter(chrom_metrics, n_cnvs > 0))+
geom_quasirandom(aes(x = chromosome, y = dup_span_percent))+
theme_minimal() +
labs(title = "Percent of Chromosome Spanned by dupCNVs per Chromosome",
subtitle = "Only chromosomes with at least 1 dupCNV",
y = "Percent of Chromosome Spanned by dupCNVs",
x = "Chromosome")
ggMarginal(p, type = "histogram")p <- ggplot(filter(chrom_metrics, dup_coverage_percent > 10))+
geom_quasirandom(aes(x = chromosome, y = dup_span_percent))+
theme_minimal() +
labs(title = "Percent of Chromosome Spanned by dupCNVs per Chromosome",
subtitle = "Only chromosomes with at least 10% spanned",
y = "Percent of Chromosome Spanned by dupCNVs",
x = "Chromosome")
ggMarginal(p, type = "histogram")ggplot(filter(chrom_metrics, n_cnvs > 0), aes(x = dup_coverage_percent, y = dup_span_percent)) +
geom_hex(bins=50) +
scale_x_continuous(labels = scales::comma) +
geom_abline(slope = 1, intercept = 0, linetype = "solid", color = "lightgray") +
scale_fill_viridis_c(name = "Count\nLog10", direction = -1, trans = "log10") +
theme_minimal() +
theme(legend.position = "right")+
labs(title = "Percent of Chromosome Spanned by dupCNVs vs Covered by dupCNVs",
y = "Percent of Chromosome Spanned by dupCNVs",
x = "Percent of Chromosome Covered by dupCNVs")ggplot(filter(chrom_metrics, n_cnvs > 0), aes(x = dup_coverage_percent, y = n_cnvs)) +
geom_hex(bins=50) +
scale_x_continuous(labels = scales::comma) +
scale_fill_viridis_c(name = "Count\nLog10", direction = -1, trans = "log10") +
theme_minimal() +
theme(legend.position = "right")+
labs(title = "Number of CNVs vs Percent of Chromosome Covered by dupCNVs ",
y = "Number of CNVs",
x = "Percent of Chromosome Covered by dupCNVs")ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = dup_span_percent)) +
geom_point(aes(color = n_cnvs)) +
scale_x_continuous(labels = scales::comma) +
scale_color_distiller(palette = "GnBu", direction = -1, trans = "log2", name = "Number\n of CNVs") +
theme_minimal() +
theme(legend.position = "right")+
labs(title = "Percent of Chromosome Covered by CNVs vs Spanned by CNVs",
y = "Percent of Chromosome Spanned by CNVs",
x = "Percent of Chromosome Covered by CNVs")ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = dup_span_percent)) +
geom_point(aes(color = norm_chrom_median)) +
scale_x_continuous(labels = scales::comma) +
scale_color_viridis_c(name = "Normalized Median\nDepth of Chromosome", direction = -1) +
theme_minimal() +
theme(legend.position = "right")+
labs(title = "Percent of Chromosome Covered by CNVs vs Spanned by CNVs",
y = "Percent of Chromosome Spanned by CNVs",
x = "Percent of Chromosome Covered by CNVs")ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = dup_span_percent)) +
geom_point(aes(color = norm_chrom_mean)) +
facet_wrap(~cut(norm_chrom_median, breaks = 6), nrow = 2) +
scale_x_continuous(labels = scales::comma) +
scale_color_viridis_c(name = "Normalized Mean\nDepth of Chromosome", direction = -1) +
theme_minimal() +
theme(legend.position = "right")+
labs(title = "Percent of Chromosome Covered by CNVs vs Spanned by CNVs",
y = "Percent of Chromosome Spanned by CNVs",
x = "Percent of Chromosome Covered by CNVs")The Normalized depth (norm_depth) is the median of the normalized depth of the windows that are part of each dupCNV region.
The Median depth of dupCNV (median_cnv_depth) is the median of the Normalized depth of all dupCNVs in the chromosome. There are values bellow the threshold of depth to call a Duplication CNV because that threshold was applied to the smoothed values and this are the raw values.
The Normalized median depth of chromosomes (norm_chrom_median) is the normalized median of the depth of the positions in the whole chromosome. (First the median was calculated and then normalized).
The Percent of Chromosome Covered by dupCNVs (dup_coverage_percent) is the percentage of the full length of the chromosome that is part of called dupCNVs.
The Percent of Chromosome Spanned by dupCNVs (dup_span_percent) is the percentage of the full length of the chromosome that is in between the leftmost and rightmost dupCNVs.
color_range <- paste("2.6 -", max(chrom_metrics$median_cnv_depth, na.rm = TRUE))
colors <- c("#FDE725", "#1F9E89", "#440154")
names(colors) <-c("0 - 1.6", "1.6 - 2.6", color_range)
ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = norm_chrom_median)) +
geom_hline(yintercept = c(1, 2), color = "black", linetype = "dashed") +
geom_point(aes(color = cut(median_cnv_depth,
breaks = c(-Inf, 1.6, 2.6, Inf),
labels = c("0 - 1.6", "1.6 - 2.6", color_range))),
alpha = 0.5) +
scale_color_manual(values = colors,
name = "Median dupCNV Depth") +
theme_minimal() +
theme(legend.position = "right") +
labs(title = "Percent of Chromosome Covered by dupCNVs vs. Depth of Chromosome",
y = "Normalized Median Depth of Chromosome",
x = "Percent of Chromosome Covered by dupCNVs")color_range <- paste("2.6 -", max(chrom_metrics$mean_cnv_depth, na.rm = TRUE))
colors <- c("#FDE725", "#1F9E89", "#440154")
names(colors) <-c("0 - 1.6", "1.6 - 2.6", color_range)
ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = norm_chrom_mean)) +
geom_hline(yintercept = c(1, 2), color = "black", linetype = "dashed") +
geom_point(aes(color = cut(mean_cnv_depth,
breaks = c(-Inf, 1.6, 2.6, Inf),
labels = c("0 - 1.6", "1.6 - 2.6", color_range))),
alpha = 0.5) +
scale_color_manual(values = colors,
name = "Mean dupCNV Depth") +
theme_minimal() +
theme(legend.position = "right") +
labs(title = "Percent of Chromosome Covered by dupCNVs vs. Depth of Chromosome",
y = "Normalized Mean Depth of Chromosome",
x = "Percent of Chromosome Covered by dupCNVs")color_range <- paste("2.6 -", max(chrom_metrics$mean_cnv_depth, na.rm = TRUE))
colors <- c("#FDE725", "#1F9E89", "#440154")
names(colors) <-c("0 - 1.6", "1.6 - 2.6", color_range)
ggplot(chrom_metrics, aes(x = norm_chrom_median, y = norm_chrom_mean)) +
geom_hline(yintercept = c(1, 2), color = "black", linetype = "dashed") +
geom_point(aes(color = cut(mean_cnv_depth,
breaks = c(-Inf, 1.6, 2.6, Inf),
labels = c("0 - 1.6", "1.6 - 2.6", color_range))),
alpha = 0.5) +
scale_color_manual(values = colors,
name = "Mean dupCNV Depth") +
theme_minimal() +
theme(legend.position = "right") +
labs(title = "Normalized Mean Depth vs. Normalized Median Depth of Chromosome",
y = "Normalized Mean Depth of Chromosome",
x = "Normalized Median Depth of Chromosome")Filter by percent of chromosome in dupCNV or chromosome depth
size_threshold <- 50
depth_threshold <- 1.55chrom_metrics_filtered <- chrom_metrics %>%
filter(dup_coverage_percent >= size_threshold | norm_chrom_median >= 1.55)ggplot(chrom_metrics_filtered, aes(x = dup_coverage_percent, y = norm_chrom_median)) +
geom_hline(yintercept = c(1, 2), color = "black", linetype = "dashed") +
geom_point(aes(color = cut(median_cnv_depth,
breaks = c(-Inf, 1.6, 2.6, Inf),
labels = c("0 - 1.6", "1.6 - 2.6", color_range))),
alpha = 0.5) +
scale_color_manual(values = colors,
name = "Median dupCNV Depth") +
theme_minimal() +
theme(legend.position = "right") +
labs(title = "Percent of Chromosome Covered by dupCNVs vs. Depth of Chromosome",
y = "Normalized Median Depth of Chromosome",
x = "Percent of Chromosome Covered by dupCNVs")write_tsv(chrom_metrics_filtered, duplications_out_path)chrom_metrics_filtered %>%
filter(norm_chrom_median > 2.5)%>%
select(sample, strain, chromosome, norm_chrom_median, dup_coverage_percent)| sample | strain | chromosome | norm_chrom_median | dup_coverage_percent |
|---|---|---|---|---|
| ERS542397 | 14936_1#6 | chr04 | 2.74 | 64.99 |